
% Journal version of the partitioned parallel radix sort paper (see also radix_lee).
@article{Lee2002656,
  author  = {ShinJae Lee and Minsoo Jeon and Dongseung Kim and Andrew Sohn},
  title   = {Partitioned Parallel Radix Sort},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {62},
  number  = {4},
  pages   = {656--668},
  year    = {2002},
}

@article{Sintorn20081381,
  author  = {Erik Sintorn and Ulf Assarsson},
  title   = {Fast parallel {GPU}-sorting using a hybrid algorithm},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {68},
  number  = {10},
  pages   = {1381--1388},
  year    = {2008},
}

% Batcher's sorting-network paper.
@inproceedings{bitonic_orig,
  title     = {Sorting networks and their applications},
  author    = {Batcher, K. E.},
  year      = {1968},
  booktitle = {Proceedings of the Joint Computer Conference},
  series    = {AFIPS '68 (Spring)},
  location  = {Atlantic City, New Jersey},
}

@techreport{blelloch,
  title       = {An Evaluation of Sorting as a Supercomputer Benchmark},
  author      = {Blelloch, G. and Dagum, L. and Smith, S. and Thearling, K. and Zagha, M.},
  year        = {1993},
  institution = {NASA},
  url         = {http://www.thearling.com/text/nasa/nasa.htm},
}

% Chapter in the GPU Gems book; incollection requires a publisher.
% NOTE(review): publisher/editor were added from the book's front matter - confirm.
@incollection{bitonic_gpugems,
  author    = {Ian Buck and Tim Purcell},
  title     = {A Toolkit for Computation on {GPUs}},
  booktitle = {{GPU Gems}},
  editor    = {Fernando, Randima},
  chapter   = {37},
  publisher = {Addison-Wesley},
  year      = {2004},
}

@article{iesproadmap,
  author  = {Dongarra, J. and Beckman, P. and others},
  title   = {International Exascale Software Roadmap},
  journal = {International Journal of High Performance Computing Applications},
  volume  = {25},
  number  = {1},
  year    = {2011},
}

@inproceedings{gputerasort,
  title     = {{GPUTeraSort}: high performance graphics co-processor sorting for large database management},
  author    = {Govindaraju, Naga and Gray, Jim and Kumar, Ritesh and Manocha, Dinesh},
  year      = {2006},
  booktitle = {Proceedings of the 2006 ACM SIGMOD International Conference on Management of Data},
  series    = {SIGMOD '06},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {325--336},
  numpages  = {12},
}

@inproceedings{huang_chow_sample_sort_1983,
  author    = {Huang, J. S. and Chow, Y. C.},
  title     = {Parallel sorting and data partitioning by sampling},
  booktitle = {Proceedings of the 7th International Computer Software and Applications Conference},
  year      = {1983},
}

% Conference paper: the venue belongs in booktitle, not journal, and the
% placeholder "volume 0" must not be printed.
@inproceedings{bitonic_kim,
  author    = {Yong Cheol Kim and Minsoo Jeon and Dongseung Kim and Andrew Sohn},
  title     = {Communication-Efficient Bitonic Sort on a Distributed Memory Parallel Computer},
  booktitle = {International Conference on Parallel and Distributed Systems},
  year      = {2001},
  pages     = {165},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}

% Proceedings (LNCS) version of the partitioned parallel radix sort paper.
@incollection{radix_lee,
  author    = {Lee, Shin-Jae and Jeon, Minsoo and Sohn, Andrew and Kim, Dongseung},
  title     = {Partitioned Parallel Radix Sort},
  booktitle = {High Performance Computing},
  series    = {Lecture Notes in Computer Science},
  volume    = {1940},
  publisher = {Springer Berlin / Heidelberg},
  pages     = {160--171},
  year      = {2000},
}

% The exported booktitle had lost its conjunction; spelled as in the other
% IPDPS entries of this file.
@inproceedings{5470444,
  author    = {Leischner, N. and Osipov, V. and Sanders, P.},
  title     = {{GPU} sample sort},
  booktitle = {2010 IEEE International Symposium on Parallel and Distributed Processing (IPDPS)},
  year      = {2010},
}

@techreport{duane,
  title       = {Revisiting Sorting for {GPGPU} Stream Architectures},
  author      = {Merrill, Duane and Grimshaw, A.},
  year        = {2010},
  institution = {University of Virginia},
  url         = {http://www.cs.virginia.edu/~dgm4d/},
}

@techreport{merrillscan,
  title       = {Parallel Scan for Stream Architectures},
  author      = {Merrill, Duane and Grimshaw, A.},
  year        = {2009},
  institution = {University of Virginia},
  url         = {http://www.cs.virginia.edu/~dgm4d/},
}

@article{bitonic_parallel,
  author  = {D. Nassimi and S. Sahni},
  title   = {Bitonic Sort on a Mesh-Connected Parallel Computer},
  journal = {IEEE Transactions on Computers},
  volume  = {28},
  pages   = {2--7},
  year    = {1979},
  address = {Los Alamitos, CA, USA},
}

@inproceedings{Nodine:1993:DDS:165231.165247,
  title     = {Deterministic distribution sort in shared and distributed memory multiprocessors},
  author    = {Nodine, Mark H. and Vitter, Jeffrey Scott},
  year      = {1993},
  booktitle = {Proceedings of the Fifth Annual ACM Symposium on Parallel Algorithms and Architectures},
  series    = {SPAA '93},
  pages     = {120--129},
  location  = {Velen, Germany},
}

@article{reif_logarithmic_1987,
  author  = {Reif, J. H. and Valiant, L. G.},
  title   = {A logarithmic time sort for linear size networks},
  journal = {Journal of the {ACM} {(JACM)}},
  volume  = {34},
  number  = {1},
  pages   = {60--76},
  year    = {1987},
}

@article{shi_regularsampling_1992,
  author  = {Hanmao Shi and Jonathan Schaeffer},
  title   = {Parallel Sorting By Regular Sampling},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {14},
  number  = {4},
  pages   = {361--372},
  year    = {1992},
}

% The colon is kept outside the protective braces so that styles which
% sentence-case titles still see it and keep the capital after it.
@inproceedings{tritonsort_2011,
  author    = {A. Rasmussen and G. Porter and M. Conley and H. V. Madhyastha and R. N. Mysore and A. Pucher and A. Vahdat},
  title     = {{TritonSort}: A Balanced {Large-Scale} Sorting System},
  booktitle = {Proceedings of {NSDI}},
  year      = {2011},
}

% quantnuma: a malformed duplicate of this entry (missing comma after the
% location field) used to sit here. It was removed because the same key is
% defined again, well-formed, further down in this file.

% In "Last, First" form a second comma introduces a Jr part, so the middle
% initial must follow the given name without a comma.
@article{Aggarwal:1988:ICS:48529.48535,
  author  = {Aggarwal, Alok and Vitter, Jeffrey S.},
  title   = {The input/output complexity of sorting and related problems},
  journal = {Communications of the ACM},
  volume  = {31},
  number  = {9},
  month   = sep,
  year    = {1988},
  pages   = {1116--1127},
}


% NOTE(review): year set to 2002, the publication year of LNCS 2327 (ISHPC 2002);
% the exported 2006 looks like the online-posting date - confirm.
@incollection{springerlink:10.1007/3-540-47847-7_5,
  author    = {Jeon, Minsoo and Kim, Dongseung},
  title     = {Parallelizing Merge Sort onto Distributed Memory Parallel Computers},
  booktitle = {High Performance Computing},
  editor    = {Zima, Hans and Joe, Kazuki and Sato, Mitsuhisa and Seo, Yoshiki and Shimasaki, Masaaki},
  series    = {Lecture Notes in Computer Science},
  volume    = {2327},
  publisher = {Springer Berlin / Heidelberg},
  pages     = {449--454},
  year      = {2002},
}

@inproceedings{intel,
  title     = {Fast sort on {CPU}s and {GPU}s: a case for bandwidth oblivious {SIMD} sort},
  author    = {Satish, Nadathur and Kim, Changkyu and Chhugani, Jatin and Nguyen, Anthony D. and Lee, Victor W. and Kim, Daehyun and Dubey, Pradeep},
  year      = {2010},
  booktitle = {Proceedings of the 2010 International Conference on Management of Data},
  series    = {SIGMOD '10},
  pages     = {351--362},
  location  = {Indianapolis, Indiana, USA},
}

% Initials are space-separated so that abbreviating styles keep every initial.
@article{keeneland2011,
  author  = {Vetter, J. S. and Glassbrook, R. and Dongarra, J. and Schwan, K. and
             Loftis, B. and McNally, S. and Meredith, J. S. and Rogers, J. and
             Roth, P. and Spafford, K. and Yalamanchili, S.},
  title   = {Keeneland: Bringing Heterogeneous {GPU} Computing to the Computational Science Community},
  journal = {IEEE Computing in Science and Engineering},
  volume  = {13},
  number  = {5},
  year    = {2011},
}

@misc{sortbenchmark,
  title        = {{The Annual Sort Benchmark Home Page}},
  howpublished = {\url{http://sortbenchmark.org/}},
  year         = {2011},
  key          = {sortbenchmark},
}

@inproceedings{charmpp,
  title     = {Highly Scalable Parallel Sorting},
  author    = {Solomonik, E. and Kale, L.},
  year      = {2010},
  booktitle = {Proceedings of the 2010 International IEEE Symposium on Parallel and Distributed Processing},
  series    = {IPDPS '10},
  pages     = {1--12},
  location  = {Atlanta, GA},
}

@article{dca,
  title   = {Accuracy and Performance of Graphics Processors: A Quantum Monte Carlo Application Case Study},
  author  = {Meredith, Jeremy S. and Alvarez, Gonzalo and Maier, Thomas A. and Schulthess, Thomas C. and Vetter, Jeffrey S.},
  year    = {2009},
  journal = {Parallel Computing},
  volume  = {35},
  number  = {3},
  pages   = {151--163},
  issn    = {0167-8191},
}

@inproceedings{s3d,
  author    = {Spafford, Kyle L. and Meredith, Jeremy S. and Vetter, Jeffrey S. and Chen, Jacqueline and Grout, Ray and Sankaran, Ramanan},
  title     = {Accelerating {S3D}: A {GPGPU} Case Study},
  booktitle = {HeteroPar '09: Proceedings of the Seventh International Workshop on Algorithms, Models, and Tools for Parallel Computing on Heterogeneous Platforms},
  year      = {2009},
  location  = {Delft, The Netherlands},
}
 
@inproceedings{vmd,
  title     = {{GPU} Acceleration of Cutoff Pair Potentials for Molecular Modeling Applications},
  author    = {Christopher I. Rodrigues and David J. Hardy and John E. Stone and Klaus Schulten and Wen-Mei W. Hwu},
  year      = {2008},
  booktitle = {CF '08: Proceedings of the 2008 Conference on Computing Frontiers},
  pages     = {273--282},
  location  = {Ischia, Italy},
}

@inproceedings{scatter,
  title     = {Efficient Gather and Scatter Operations on Graphics Processors},
  author    = {Bingsheng He and Naga K. Govindaraju and Qiong Luo and Burton Smith},
  year      = {2007},
  booktitle = {SC '07: Proceedings of the 2007 ACM/IEEE Conference on Supercomputing},
  pages     = {1--12},
  location  = {Reno, Nevada},
}

% Symposium paper: typed as inproceedings with the venue in booktitle;
% the empty volume/number placeholders are dropped.
@inproceedings{osaka,
  author    = {Fujimoto, N.},
  title     = {Faster Matrix-Vector Multiplication on {GeForce} {8800GTX}},
  booktitle = {IEEE International Symposium on Parallel and Distributed Processing},
  year      = {2008},
  month     = apr,
  pages     = {1--8},
}

% The umlaut is written as a brace group starting with the accent command so
% BibTeX treats it as one letter.
@inproceedings{spmv,
  author    = {Jeff Bolz and Ian Farmer and Eitan Grinspun and Peter Schr{\"o}der},
  title     = {Sparse Matrix Solvers on the {GPU}: Conjugate Gradients and Multigrid},
  booktitle = {ACM SIGGRAPH 2003},
  year      = {2003},
  pages     = {917--924},
  location  = {San Diego, California},
}

% NOTE(review): year/issue/month corrected to volume 28 of this journal
% (2007, issue 16, December); the other entries in this file place vol. 26 in
% 2005 and vol. 30 in 2009 - confirm against the publisher record.
@article{f@h,
  author  = {J E Stone and J C Phillips and P L Freddolino and D J Hardy and L G Trabuco and K Schulten},
  title   = {Accelerating Molecular Modeling Applications With Graphics Processors},
  journal = {Journal of Computational Chemistry},
  volume  = {28},
  number  = {16},
  pages   = {2618--2640},
  month   = dec,
  year    = {2007},
}
	
@inproceedings{ppac,
  author    = {Spafford, K. L. and Meredith, J. S. and Vetter, J. S.},
  title     = {Quartile and Outlier Detection on Heterogeneous Clusters Using Distributed Radix Sort},
  booktitle = {2011 IEEE International Conference on Cluster Computing (CLUSTER)},
  year      = {2011},
  month     = sep,
  pages     = {412--419},
}

@misc{titan,
  title        = {Titan Supercomputer},
  organization = {Oak Ridge National Laboratory},
  howpublished = {\url{http://www.olcf.ornl.gov/titan/}},
  year         = {2012},
  key          = {titan},
}


%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
% Below may need reformatting
%
%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%
@techreport{hpcc:dongarra2005,
  author      = {Dongarra, Jack J. and Luszczek, Piotr},
  title       = {Introduction to the {HPCChallenge} Benchmark Suite},
  institution = {Innovative Computing Laboratory, University of Tennessee-Knoxville},
  number      = {ICL-UT-05-01},
  year        = {2005},
  url         = {http://icl.cs.utk.edu/news_pub/submissions/hpcc-challenge-intro.pdf},
}


@misc{top500list,
  title        = {{Top500 Supercomputing Sites}},
  howpublished = {\url{http://www.top500.org}},
  year         = {2010},
  key          = {top500},
}

@misc{green500list,
  title        = {{The Green500 List: Environmentally Responsible Supercomputing}},
  howpublished = {\url{http://www.green500.org}},
  year         = {2010},
  key          = {green500},
}

@misc{keenelandURL,
  title        = {{Keeneland}},
  howpublished = {\url{http://keeneland.gatech.edu}},
  year         = {2011},
  key          = {Keeneland},
}

@manual{fermi,
  title        = {{NVIDIA's} next generation {CUDA} compute architecture: {Fermi}},
  organization = {NVIDIA},
  number       = {V1.1},
  howpublished = {\url{http://www.nvidia.com/content/PDF/fermi_white_papers/NVIDIA_Fermi_Compute_Architecture_Whitepaper.pdf}},
  key          = {NVIDIAFermi},
}

@book{CUDAbyExample,
  author    = {Jason Sanders and Edward Kandrot},
  title     = {{CUDA} by example: an introduction to general-purpose {GPU} programming},
  edition   = {1st},
  publisher = {Addison-Wesley Professional},
  year      = {2010},
}

@misc{CUDAzone,
  title        = {{CUDA} Zone},
  howpublished = {\url{http://www.nvidia.com/object/cuda_home_new.html}},
  year         = {2011},
  key          = {CUDAZone},
}

@manual{CUDARefMan,
  key          = {CUDARefMan},
  title        = {{NVIDIA CUDA}: Reference manual},
  organization = {NVIDIA},
  number       = {Version 3.2 Beta},
  month        = aug,
  year         = {2010},
}


% The corporate author is double-braced so BibTeX does not split it into
% given/family name parts.
@manual{OpenCLSpec,
  key          = {OpenCLSpec},
  author       = {{Khronos OpenCL Working Group}},
  editor       = {Aaftab Munshi},
  title        = {The {OpenCL} specification, version 1.1},
  number       = {36},
  organization = {Khronos},
  howpublished = {\url{http://www.khronos.org/registry/cl/specs/opencl-1.1.pdf}},
  year         = {2010},
}

@manual{PTXSpec,
  key          = {PTX},
  title        = {{PTX}: parallel thread execution, {ISA} version 2.2},
  organization = {NVIDIA},
  month        = oct,
  year         = {2010},
}

@misc{Open64,
  title        = {{Open64}: Home},
  howpublished = {\url{http://www.open64.net}},
  year         = {2011},
  key          = {Open64},
}

@manual{CUDAProgGuide,
  key          = {CUDAProgGuide},
  title        = {{NVIDIA CUDA C} programming guide},
  organization = {NVIDIA},
  number       = {Version 3.2},
  month        = nov,
  year         = {2010},
}

@manual{CUDAFortran,
  key          = {CUDAFortran},
  title        = {{CUDA Fortran}: programming guide and reference},
  organization = {The Portland Group},
  number       = {11.0},
  month        = nov,
  year         = {2010},
}

@misc{PGICUDAx86,
  title        = {{PGI CUDA-x86}},
  howpublished = {\url{http://www.pgroup.com/resources/cuda-x86.htm}},
  year         = {2011},
  key          = {PGICUDAx86},
}

@manual{PGIAccel,
  key          = {PGIAccel},
  title        = {{PGI Accelerator} programming model for {Fortran} \& {C}},
  organization = {The Portland Group},
  number       = {v1.3},
  month        = nov,
  year         = {2010},
  howpublished = {\url{http://www.pgroup.com/lit/whitepapers/pgi_accel_prog_model_1.3.pdf}},
}

@inproceedings{ref:ocelot-llvm,
  title     = {Ocelot: A Dynamic Compiler for Bulk-Synchronous Applications in Heterogeneous Systems},
  author    = {Gregory Diamos and Andrew Kerr and Sudhakar Yalamanchili and Nathan Clark},
  year      = {2010},
  booktitle = {Proceedings of The Nineteenth International Conference on Parallel Architectures and Compilation Techniques},
  publisher = {ACM},
  location  = {Vienna, Austria},
}

% Only the acronyms are brace-protected; bracing the whole title would stop
% the bibliography style from applying its own casing.
@inproceedings{LLVM:CGO04,
  author    = {Chris Lattner and Vikram Adve},
  title     = {{LLVM}: A Compilation Framework for Lifelong Program Analysis \& Transformation},
  booktitle = {Proceedings of the 2004 International Symposium on Code Generation and Optimization ({CGO'04})},
  address   = {Palo Alto, California},
  month     = mar,
  year      = {2004},
}


@manual{OpenMPSpec,
  key          = {OpenMP},
  title        = {{OpenMP} application program interface},
  organization = {OpenMP Architecture Review Board},
  number       = {3.0},
  month        = may,
  year         = {2008},
  howpublished = {\url{http://www.openmp.org/mp-documents/spec30.pdf}},
}


@misc{HMPP,
  title        = {{HMPP Workbench}},
  howpublished = {\url{http://www.caps-entreprise.com/fr/page/index.php?id=49&p_p=36}},
  year         = {2011},
  key          = {HMPP},
}


% doi holds the bare identifier; the resolver form stays in url.
@inproceedings{Lee:OpenMPC,
  author    = {Lee, Seyong and Eigenmann, Rudolf},
  title     = {{OpenMPC}: Extended {OpenMP} Programming and Tuning for {GPUs}},
  booktitle = {Proceedings of the 2010 ACM/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis},
  series    = {SC '10},
  year      = {2010},
  isbn      = {978-1-4244-7559-9},
  pages     = {1--11},
  numpages  = {11},
  url       = {http://dx.doi.org/10.1109/SC.2010.36},
  doi       = {10.1109/SC.2010.36},
  acmid     = {1884674},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
}

@book{gropp:mpi2,
  author    = {William Gropp and Rajeev Thakur and Ewing Lusk},
  title     = {Using {MPI-2}: Advanced Features of the Message Passing Interface},
  publisher = {MIT Press},
  address   = {Cambridge, MA, USA},
  year      = {1999},
  isbn      = {026257134X},
}

% The library-catalogue text is kept in annote (not printed by the standard
% styles) rather than note, which would be typeset in the bibliography.
@book{JV:Gro99a,
  author    = {Gropp, William and Lusk, Ewing and Skjellum, Anthony},
  title     = {Using {MPI}: portable parallel programming with the message-passing interface},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  edition   = {2nd},
  series    = {Scientific and engineering computation},
  annote    = {99016613
William Gropp, Ewing Lusk, Anthony Skjellum.
Includes bibliographical references and index.},
  keywords  = {Parallel programming (Computer science)
Parallel computers Programming.
Computer interfaces.},
  year      = {1999},
}

% NOTE(review): the issue number previously repeated the year (1998);
% set to issue 2 of volume 17 - confirm against the publisher record.
@article{JV:Num98,
  author  = {Numrich, R. W. and Reid, J.},
  title   = {{Co-Array Fortran} for parallel programming},
  journal = {ACM SIGPLAN FORTRAN Forum},
  volume  = {17},
  number  = {2},
  pages   = {1--31},
  year    = {1998},
}


@techreport{JV:Car99a,
  author      = {Carlson, W. W. and Draper, J. M. and Culler, D. and Yelick, K. and Brooks, E. and Warren, K.},
  title       = {Introduction to {UPC} and language specification},
  institution = {Center for Computing Sciences, IDA},
  type        = {Technical Report},
  number      = {CCS-TR-99-157},
  year        = {1999},
}

% The underscore in the document number is escaped so it is safe to typeset
% under styles that print the number field.
@manual{cudagdb,
  key          = {CUDAgdb},
  title        = {{CUDA-GDB (NVIDIA CUDA Debugger)}},
  organization = {NVIDIA},
  number       = {DU-05227-001\_V3.2},
  month        = oct,
  year         = {2010},
}

@manual{TotalView,
  title        = {{TotalView} Users Guide, Version 8.9},
  organization = {TotalView Technologies},
  howpublished = {\url{http://www.totalviewtech.com/support/documentation/pdf/TotalView8-9_user_guide.pdf}},
  year         = {2010},
  key          = {TotalView},
}

@manual{TotalViewCUDA,
  title        = {{TotalView} {CUDA} debugger users guide supplement},
  organization = {TotalView Technologies},
  howpublished = {\url{http://www.totalviewtech.com/support/documentation/pdf/CUDATotalViewDebuggerUsersGuideSupplement_V4.pdf}},
  year         = {2010},
  key          = {TotalViewCUDA},
}

@manual{DDT,
  title        = {{DDT} user guide},
  organization = {Allinea Software},
  howpublished = {\url{http://www.allinea.com/downloads/userguide.pdf}},
  year         = {2010},
  key          = {DDT},
}

% doi holds the bare identifier; the resolver form stays in url.
@inproceedings{Danalis:2010:SHC:1735688.1735702,
  author    = {Danalis, Anthony and Marin, Gabriel and McCurdy, Collin and Meredith, Jeremy S. and Roth, Philip C. and Spafford, Kyle and Tipparaju, Vinod and Vetter, Jeffrey S.},
  title     = {The {Scalable Heterogeneous Computing (SHOC)} benchmark suite},
  booktitle = {Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units},
  series    = {GPGPU '10},
  year      = {2010},
  isbn      = {978-1-60558-935-0},
  location  = {Pittsburgh, Pennsylvania},
  pages     = {63--74},
  numpages  = {12},
  url       = {http://doi.acm.org/10.1145/1735688.1735702},
  doi       = {10.1145/1735688.1735702},
  acmid     = {1735702},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {GPGPU, benchmarking, graphics processors, performance},
}

@misc{pgdbgsupport,
  title        = {{PGI} User Forums: Can anyone recommend a debugger to work with {PGF}},
  howpublished = {\url{http://www.pgroup.com/userforum/viewtopic.php?p=8677&sid=37684e1591a85b79828f190a95c51e10}},
  year         = {2010},
  key          = {PGDBGsupport},
}

% doi holds the bare identifier; the resolver form stays in url.
@inproceedings{Malony:2010:EAP:1810085.1810105,
  author    = {Malony, Allen D. and Biersdorff, Scott and Spear, Wyatt and Mayanglambam, Shangkar},
  title     = {An experimental approach to performance measurement of heterogeneous parallel applications using {CUDA}},
  booktitle = {Proceedings of the 24th ACM International Conference on Supercomputing},
  series    = {ICS '10},
  year      = {2010},
  isbn      = {978-1-4503-0018-6},
  location  = {Tsukuba, Ibaraki, Japan},
  pages     = {127--136},
  numpages  = {10},
  url       = {http://doi.acm.org/10.1145/1810085.1810105},
  doi       = {10.1145/1810085.1810105},
  acmid     = {1810105},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {GPGPU, performance tools, profiling, tracing},
}

@article{shende:tau,
  author  = {Shende, S. and Malony, A. D.},
  title   = {The {TAU} Parallel Performance System},
  journal = {International Journal of High Performance Computing Applications},
  volume  = {20},
  number  = {2},
  pages   = {287--311},
  year    = {2006},
}

% The underscore in the document number is escaped so it is safe to typeset
% under styles that print the number field.
@manual{nvidia:perfkit,
  key          = {PerfKit},
  title        = {{NVIDIA PerfKit: NVIDIA} performance toolkit},
  organization = {NVIDIA},
  number       = {DA-01800-001\_v04},
  month        = oct,
  year         = {2006},
  howpublished = {\url{http://developer.download.nvidia.com/tools/NVPerfKit/6.0/User_Guide_NVPerfKit.pdf}},
}

@misc{vampir:cuda,
  title        = {{VampirTrace} - Tracing {NVIDIA} {CUDA} applications},
  howpublished = {\url{http://tu-dresden.de/die_tu_dresden/zentrale_einrichtungen/zih/forschung/software_werkzeuge_zur_unterstuetzung_von_programmierung_und_optimierung/vampirtrace/cuda/index_html/document_view?body_language=en}},
  year         = {2010},
  key          = {VampirCUDA},
}

@misc{nvidia:nexus,
  title        = {{NVIDIA} {Parallel Nsight} 1.5 user guide},
  howpublished = {\url{http://http.developer.nvidia.com/ParallelNsight/UserGuide/HTML/User%20Guide.1033_en_US/webframe.html}},
  year         = {2010},
  key          = {Nexus},
}

% Author spelling follows the ref:ocelot-llvm entry ("Sudhakar"); city spelling
% follows the Danalis GPGPU '10 entry ("Pittsburgh").
@inproceedings{ocelot:performance,
  author    = {Andrew Kerr and Gregory Diamos and Sudhakar Yalamanchili},
  title     = {Modeling {GPU-CPU} Workloads and Systems},
  booktitle = {Third Workshop on General-Purpose Computation on Graphics Processing Units},
  address   = {Pittsburgh, Pennsylvania, USA},
  days      = {19},
  month     = apr,
  year      = {2010},
}

% Names are separated by " and " only; a comma before "and" would be parsed
% as part of the preceding name.
@inproceedings{papi,
  author    = {Terpstra, D. and Jagode, H. and You, H. and Dongarra, J.},
  title     = {Collecting performance data with {PAPI-C}},
  booktitle = {Tools for High Performance Computing 2009: 3rd Parallel Tools Workshop},
  location  = {Dresden, Germany},
  publisher = {Springer Berlin/Heidelberg},
  pages     = {157--173},
  year      = {2009},
}

% The underscore in the document number is escaped so it is safe to typeset
% under styles that print the number field.
@manual{nvidia:visprofiler,
  key          = {NVIDIAProfiler},
  title        = {Compute Visual Profiler},
  organization = {NVIDIA},
  number       = {DU-05162-001\_v02},
  month        = oct,
  year         = {2010},
}



@misc{gpudirect,
  key          = {GPUDirect},
  title        = {{NVIDIA GPUDirect} technology: accelerating {GPU}-based systems},
  howpublished = {\url{http://www.mellanox.com/pdf/whitepapers/TB_GPU_Direct.pdf}},
  month        = may,
  year         = {2010},
}

@misc{thrust,
  key          = {Thrust},
  title        = {QuickStartGuide - thrust - a brief tutorial for new {Thrust} developers},
  howpublished = {\url{http://code.google.com/p/thrust/wiki/QuickStartGuide#Additional_Resources}},
  month        = feb,
  year         = {2011},
}

@article{magma,
  author  = {Emmanuel Agullo and Jim Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaief and Piotr Luszczek and Stanimire Tomov},
  title   = {Numerical linear algebra on emerging architectures: the {PLASMA} and {MAGMA} projects},
  journal = {Journal of Physics: Conference Series},
  volume  = {180},
  year    = {2009},
}

@manual{jacket,
  title        = {{Jacket} function reference},
  organization = {AccelerEyes},
  number       = {V1.7},
  howpublished = {\url{http://www.accelereyes.com/content/doc/FunctionReferenceGuide.pdf}},
  year         = {2011},
  key          = {Jacket},
}

@inproceedings{quantnuma,
  title     = {Quantifying {NUMA} and contention effects in multi-{GPU} systems},
  author    = {Kyle Spafford and Jeremy S. Meredith and Jeffrey S. Vetter},
  year      = {2011},
  booktitle = {Proceedings of The Fourth Workshop on General Purpose Processing on Graphics Processing Units},
  publisher = {ACM},
  location  = {Newport Beach, California},
}

@misc{convey,
  key          = {Convey},
  title        = {The {Convey HC-1} computer},
  organization = {Convey Computer},
  howpublished = {\url{http://www.conveycomputer.com/Resources/ConveyArchitectureWhiteP.pdf}},
  month        = nov,
  year         = {2008},
}

@misc{fusion,
  key          = {Fusion},
  author       = {Brookwood, Nathan},
  title        = {{AMD Fusion} family of {APUs}: enabling a superior, immersive {PC} experience},
  organization = {Advanced Micro Devices},
  howpublished = {\url{http://sites.amd.com/us/Documents/48423B_fusion_whitepaper_WEB.pdf}},
  month        = mar,
  year         = {2010},
}

@inproceedings{mic,
  author       = {Kirk Skaugen},
  title        = {Petascale to exascale: extending {Intel's} {HPC} commitment},
  booktitle    = {ISC 2010 keynote presentation},
  year         = {2010},
  howpublished = {\url{http://download.intel.com/pressroom/archive/reference/ISC_2010_Skaugen_keynote.pdf}},
}

% NOTE(review): the exported page range was truncated ("--11"); restored as
% 1--11 - confirm against the proceedings. doi holds the bare identifier.
@inproceedings{Volkov:2008:BGT:1413370.1637936,
  author    = {Volkov, Vasily and Demmel, James W.},
  title     = {Benchmarking {GPUs} to tune dense linear algebra},
  booktitle = {Proceedings of the 2008 ACM/IEEE conference on Supercomputing},
  series    = {SC '08},
  year      = {2008},
  isbn      = {978-1-4244-2835-9},
  location  = {Austin, Texas},
  pages     = {1--11},
  url       = {http://dx.doi.org/10.1145/1413370.1413402},
  doi       = {10.1145/1413370.1413402},
  acmid     = {1637936},
  publisher = {IEEE Press},
  address   = {Piscataway, NJ, USA},
}

% Single-page item: pages is the page number, not a degenerate range.
% doi holds the bare identifier; the resolver form stays in url.
@inproceedings{Grice:2009:RPI:1542275.1542279,
  author    = {Grice, Donald},
  title     = {The roadrunner project and the importance of energy efficiency on the road to exascale computing},
  booktitle = {Proceedings of the 23rd international conference on Supercomputing},
  series    = {ICS '09},
  year      = {2009},
  isbn      = {978-1-60558-498-0},
  location  = {Yorktown Heights, NY, USA},
  pages     = {2},
  numpages  = {1},
  url       = {http://doi.acm.org/10.1145/1542275.1542279},
  doi       = {10.1145/1542275.1542279},
  acmid     = {1542279},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {cell broadband engine, exaflop, exascale computing, heterogeneous multicore architectures, petaflop},
}

@misc{tianhe,
  title        = {Tianhe-1 {Pflop} Supercomputer},
  organization = {National Supercomputing Center in Tianjin},
  howpublished = {\url{http://www.nscc-tj.gov.cn/en/show.asp?id=191}},
  year         = {2010},
  key          = {Tianhe},
}

% month uses the predefined macro "apr" ("april" is not a defined macro and
% expands to nothing). Booktitle is written in natural order, matching the
% other IPDPS 2010 entry in this file.
@inproceedings{tsubame,
  author    = {Endo, T. and Nukada, A. and Matsuoka, S. and Maruyama, N.},
  title     = {Linpack evaluation on a supercomputer with heterogeneous accelerators},
  booktitle = {2010 IEEE International Symposium on Parallel and Distributed Processing (IPDPS)},
  year      = {2010},
  month     = apr,
  pages     = {1--8},
  keywords  = {ClearSpeed SIMD accelerators;ClearSpeed accelerators;Linpack benchmark;NVIDIA Tesla GPU;Opteron cores;TSUBAME supercomputer;Xeon cores;heterogeneous accelerators;load balancing;supercomputer evaluation;system architecture;benchmark testing;coprocessors;parallel machines;performance evaluation;resource allocation;},
  doi       = {10.1109/IPDPS.2010.5470353},
  issn      = {1530-2075},
}

@misc{dirac,
  title        = {Dirac, {NERSC's} {GPU} testbed},
  organization = {National Energy Research Scientific Computing Center},
  howpublished = {\url{http://newweb.nersc.gov/users/computational-systems/dirac}},
  year         = {2011},
  key          = {Dirac},
}

@misc{lincoln,
  title        = {{NCSA} scientific computing: {Intel} 64 {Tesla} {Linux} cluster {Lincoln}},
  organization = {National Center for Supercomputing Applications},
  howpublished = {\url{http://www.ncsa.illinois.edu/UserInfo/Resources/Hardware/Intel64TeslaCluster}},
  year         = {2010},
  key          = {Lincoln},
}

@inproceedings{lincolnsw,
  author    = {Volodymyr V. Kindratenko and Jeremy J. Enos and Guochun Shi and Michael T. Showerman and Galen W. Arnold and John E. Stone and James C. Phillips and Wen-mei Hwu},
  title     = {{GPU} clusters for high-performance computing},
  booktitle = {Proceedings of the 2009 Workshop on Parallel Programming on Accelerator Clusters (PPAC'09)},
  month     = aug,
  year      = {2009},
  location  = {New Orleans, Louisiana},
}

@misc{numactl,
  key          = {numactl},
  author       = {Andi Kleen},
  title        = {An {NUMA} {API} for {Linux}},
  howpublished = {\url{http://halobates.de/numaapi3.pdf}},
  month        = aug,
  year         = {2004},
}


% year holds only the four-digit year; the month moves to its own field.
% The issue number is taken from the issue segment of the url path.
% HTML markup and entities were stripped from the abstract and "%" escaped.
@article{lammps,
  author   = {Plimpton, S.},
  title    = {Fast Parallel Algorithms for Short-Range Molecular Dynamics},
  journal  = {Journal of Computational Physics},
  volume   = {117},
  number   = {1},
  pages    = {1--19},
  month    = mar,
  year     = {1995},
  url      = {http://www.ingentaconnect.com/content/ap/cp/1995/00000117/00000001/art01039},
  abstract = {Three parallel algorithms for classical molecular dynamics are presented. The first assigns each processor a fixed subset of atoms; the second assigns each a fixed subset of inter-atomic forces to compute; the third assigns each a fixed spatial region. The algorithms are suitable for molecular dynamics models which can be difficult to parallelize efficiently---those with short-range forces where the neighbors of each atom change rapidly. They can be implemented on any distributed-memory parallel machine which allows for message-passing of data between independently executing processors. The algorithms are tested on a standard Lennard-Jones benchmark problem for system sizes ranging from 500 to 100,000,000 atoms on several parallel supercomputers--the nCUBE 2, Intel iPSC/860 and Paragon, and Cray T3D. Comparing the results to the fastest reported vectorized Cray Y-MP and C90 algorithm shows that the current generation of parallel machines is competitive with conventional vector supercomputers even for small problems. For large problems, the spatial algorithm achieves parallel efficiencies of 90\% and a 1840-node Intel Paragon performs up to 165 faster than a single Cray C90 processor. Trade-offs between the three algorithms and guidelines for adapting them to more complex molecular dynamics simulations are also discussed. Copyright 1995, 1999 Academic Press},
}

% Same work as the dca entry, kept under this key for existing citations.
% The bare numeric identifier is stored in annote so it is not typeset.
@article{dcapp_gpu,
  author  = {Meredith, Jeremy S. and Alvarez, Gonzalo and Maier, Thomas A. and Schulthess, Thomas C. and Vetter, Jeffrey S.},
  title   = {Accuracy and Performance of Graphics Processors: A {Quantum Monte Carlo} Application Case Study},
  journal = {Parallel Computing},
  volume  = {35},
  number  = {3},
  pages   = {151--163},
  year    = {2009},
  annote  = {1513319},
}

@article{dcapp,
  title   = {Structure of the Pairing Interaction in the Two-Dimensional {Hubbard} Model},
  author  = {Maier, T. A. and Jarrell, M. S. and Scalapino, D. J.},
  year    = {2006},
  journal = {Physical Review Letters},
  volume  = {96},
  number  = {4},
  pages   = {47005},
}

@article{gromacs,
  author   = {Hess, Berk and Kutzner, Carsten and van der Spoel, David and Lindahl, Erik},
  title    = {{GROMACS} 4: Algorithms for Highly Efficient, Load-Balanced, and Scalable Molecular Simulation},
  journal  = {Journal of Chemical Theory and Computation},
  volume   = {4},
  number   = {3},
  pages    = {435--447},
  day      = {1},
  month    = mar,
  year     = {2008},
  doi      = {10.1021/ct700301q},
  url      = {http://dx.doi.org/10.1021/ct700301q},
  keywords = {force\_field, molecular\_dynamics, parametrization},
  abstract = {{Molecular simulation is an extremely useful, but computationally very expensive tool for studies of chemical and biomolecular systems. Here, we present a new implementation of our molecular simulation toolkit GROMACS which now both achieves extremely high performance on single processors from algorithmic optimizations and hand-coded routines and simultaneously scales very well on parallel machines. The code encompasses a minimal-communication domain decomposition algorithm, full dynamic load balancing, a state-of-the-art parallel constraint solver, and efficient virtual site algorithms that allow removal of hydrogen atom degrees of freedom to enable integration time steps up to 5 fs for atomistic simulations also in parallel. To improve the scaling properties of the common particle mesh Ewald electrostatics algorithms, we have in addition used a Multiple-Program, Multiple-Data approach, with separate node domains responsible for direct and reciprocal space interactions. Not only does this combination of algorithms enable extremely long simulations of large systems but also it provides that simulation performance on quite modest numbers of standard cluster nodes.}},
}

% GPU molecular dynamics paper (Friedrichs et al., 2009). The title is
% single-braced so the bibliography style controls its capitalisation.
@article{openmm,
    author = {Friedrichs, Mark S. and Eastman, Peter and Vaidyanathan, Vishal and Houston, Mike and Legrand, Scott and Beberg, Adam L. and Ensign, Daniel L. and Bruns, Christopher M. and Pande, Vijay S.},
    title = {Accelerating Molecular Dynamic Simulation on Graphics Processing Units},
    journal = {Journal of Computational Chemistry},
    volume = {30},
    number = {6},
    pages = {864--872},
    year = {2009},
    month = apr,
    day = {30},
    doi = {10.1002/jcc.21209},
    issn = {1096-987X},
    url = {http://dx.doi.org/10.1002/jcc.21209},
    keywords = {gpus, md\_simulation},
    posted-at = {2009-02-06 18:14:37},
    priority = {2},
    abstract = {We describe a complete implementation of all-atom protein molecular dynamics running entirely on a graphics processing unit (GPU), including all standard force field terms, integration, constraints, and implicit solvent. We discuss the design of our algorithms and important optimizations needed to fully take advantage of a GPU. We evaluate its performance, and show that it can be more than 700 times faster than a conventional implementation running on a single CPU core. {\copyright} 2009 Wiley Periodicals, Inc. J Comput Chem, 2009}
}

% GPULAMMPS project page. The acronym is brace-protected so case-changing
% styles cannot lowercase it; the link stays in the note field, wrapped in
% the url macro, exactly as before.
@manual{gpulammps,
    title = {{GPULAMMPS}},
    note = {\url{http://code.google.com/p/gpulammps/}},
    year = 2010,
}


% NAMD reference paper (Phillips et al., 2005). Author names are written in
% the unambiguous "Last, First" form.
@article{namd,
    author = {Phillips, James C. and Braun, Rosemary and Wang, Wei and Gumbart, James and Tajkhorshid, Emad and Villa, Elizabeth and Chipot, Christophe and Skeel, Robert D. and Kale, Laxmikant and Schulten, Klaus},
    title = {Scalable molecular dynamics with {NAMD}},
    journal = {Journal of Computational Chemistry},
    volume = {26},
    number = {16},
    pages = {1781--1802},
    year = {2005},
    issn = {0192-8651},
    doi = {10.1002/jcc.20289},
    url = {http://onlinelibrary.wiley.com/doi/10.1002/jcc.20289/abstract}
}

% Web page on GPU acceleration of molecular modeling applications. The
% acronym is brace-protected so case-changing styles keep it upper-case.
@manual{NAMD_gpu,
    title = {{GPU} Acceleration of Molecular Modeling Applications},
    note = {\url{http://www.ks.uiuc.edu/Research/gpu/}},
    year = 2011
}

% Keeneland overview article (Vetter et al., 2011). Initials are separated
% by spaces so BibTeX treats each one as its own given-name token; glued
% initials such as "J.S." lose the second letter when a style abbreviates.
@article{keeneland,
   author = {Vetter, J. S. and Glassbrook, R. and Dongarra, J. and Fujimoto, R. M. and Schwan, K. and Loftis, B. and McNally, S. and Meredith, J. S. and Rogers, J. and Roth, P. and Spafford, K. and Yalamanchili, S.},
   title = {Keeneland: Bringing Heterogeneous Computing using Graphics Processors to the {NSF} Computational Science Community},
   journal = {IEEE Computing in Science and Engineering},
   volume = {13},
   number = {5},
   year = {2011}
}


% GPU Gems 2 (book). The title is stored in Title Case because standard
% styles print book titles as entered and cannot restore lost capitals.
@book{Pharr:2005:GGP:1406887,
 author = {Pharr, Matt and Fernando, Randima},
 title = {{GPU} Gems 2: Programming Techniques for High-Performance Graphics and General-Purpose Computation},
 edition = {First},
 publisher = {Addison-Wesley Professional},
 year = {2005},
 isbn = {9780321545411},
}
% GPU Computing survey (Owens et al., Proceedings of the IEEE, 2008). Only
% the acronym in the title is brace-protected, and the venue is given once,
% in the journal field.
@article{owens_gpu,
    author = {Owens, J. D. and Houston, M. and Luebke, D. and Green, S. and Stone, J. E. and Phillips, J. C.},
    title = {{GPU} Computing},
    journal = {Proceedings of the IEEE},
    volume = {96},
    number = {5},
    pages = {879--899},
    year = {2008},
    month = may,
    doi = {10.1109/JPROC.2008.917757},
    issn = {0018-9219},
    url = {http://dx.doi.org/10.1109/JPROC.2008.917757},
    keywords = {gpu},
    posted-at = {2008-08-12 00:51:59},
    priority = {4},
    abstract = {The graphics processing unit (GPU) has become an integral part of today's mainstream computing systems. Over the past six years, there has been a marked increase in the performance and capabilities of GPUs. The modern GPU is not only a powerful graphics engine but also a highly parallel programmable processor featuring peak arithmetic and memory bandwidth that substantially outpaces its CPU counterpart. The GPU's rapid increase in both programmability and capability has spawned a research community that has successfully mapped a broad range of computationally demanding, complex problems to the GPU. This effort in general-purpose computing on the GPU, also known as GPU computing, has positioned the GPU as a compelling alternative to traditional microprocessors in high-performance computer systems of the future. We describe the background, hardware, and programming model for GPU computing, summarize the state of the art in tools and techniques, and present four GPU computing successes in game physics and computational biophysics that deliver order-of-magnitude performance gains over optimized CPU applications.}
}






















